##DDSAnalytics is an analytics company specializing in talent management solutions for Fortune 100 companies. The goal is to analyze existing employee data to highlight turnover (attrition).

##Load Case Study 2 data set

# Attach the packages whose functions are called in this script; the file
# contains no other library() calls, so it cannot run without them.
library(randomForest) # randomForest(), varImpPlot()
library(sqldf)        # sqldf()
library(class)        # knn()
library(caret)        # confusionMatrix(), RMSE()
library(knitr)        # kable()
library(scales)       # comma
library(plotly)       # plot_ly(), add_markers(), layout()
library(ggplot2)      # ggplot() and its layers
library(dplyr)        # %>%, group_by(), summarize(), arrange(), filter()

# Read in Attrition Data
# Attrition   : labelled data set (includes the Attrition column).
# NOAttrition : competition set to be scored by the attrition model.
Attrition <- read.csv("/Users/Kevin/Desktop/School/Doing Data Science/Project 2/CaseStudy2_data.csv", header = TRUE)
NOAttrition <- read.csv("/Users/Kevin/Desktop/School/Doing Data Science/Project 2/CaseStudy2CompSet No Attrition.csv", header = TRUE)

# Split the labelled data by outcome.
# String literals are single-quoted: in SQL, double quotes denote identifiers,
# and the original "No"/"Yes" only worked through SQLite's fallback of treating
# an unknown double-quoted identifier as a string.
AttritionNo <- sqldf("
                    select *
                    from Attrition
                    where Attrition = 'No'
                    ")
AttritionYes <- sqldf("
                    select *
                    from Attrition
                    where Attrition = 'Yes'
                    ")

##Parse and Summarize data

# Mean tenure, pay and performance measures for each Attrition group
# (Yes / No), together with the number of employees in the group.
Attrition %>%
  group_by(Attrition) %>%
  summarize(MeanCompanysWorked = mean(NumCompaniesWorked),
            MeanMonthlyIncome = mean(MonthlyIncome),
            MeanYearsAtCompany = mean(YearsAtCompany),
            MeanWorkAge = mean(Age),
            MeanTotalWorkingYears = mean(TotalWorkingYears),
            MeanJobLevel = mean(JobLevel),
            MeanPerformance = mean(PerformanceRating),
            # Fixed: this column previously averaged YearsInCurrentRole,
            # which does not match its name.
            MeanYearsSincePromotion = mean(YearsSinceLastPromotion),
            MeanSalaryHike = mean(PercentSalaryHike),
            Total = n()) %>%
  arrange(Attrition, Total)
## # A tibble: 2 x 11
##   Attrition MeanCompanysWorked MeanMonthlyIncome MeanYearsAtCompany MeanWorkAge
##   <chr>                  <dbl>             <dbl>              <dbl>       <dbl>
## 1 No                      2.66             6702                7.30        37.4
## 2 Yes                     3.08             4765.               5.19        33.8
## # … with 6 more variables: MeanTotalWorkingYears <dbl>, MeanJobLevel <dbl>,
## #   MeanPerformance <dbl>, MeanYearsSincePromotion <dbl>, MeanSalaryHike <dbl>,
## #   Total <int>

##Plot relationships

# Per-group (Attrition = Yes / No) means and head-count, rendered as a table.
AttritionStats <- Attrition %>%
  group_by(Attrition) %>%
  summarize(
            MeanAge = mean(Age),
            MeanMonthlyIncome = mean(MonthlyIncome),
            MeanWorkingYears = mean(TotalWorkingYears),
            MeanJobLevel = mean(JobLevel),
            MeanPerformance = mean(PerformanceRating),
            # Fixed: this column previously averaged YearsInCurrentRole,
            # which does not match its name. Any rendered table pasted
            # after this chunk predates the fix; re-knit to refresh it.
            MeanYearsSincePromotion = mean(YearsSinceLastPromotion),
            MeanSalaryHike = mean(PercentSalaryHike),
            Total = n()) %>%
  arrange(Attrition, Total)
kable(AttritionStats, position = "left")
## Attrition MeanAge MeanMonthlyIncome MeanWorkingYears MeanJobLevel MeanPerformance MeanYearsSincePromotion MeanSalaryHike Total
## No 37.41233 6702.000 11.602740 2.116438 3.149315 4.453425 15.17534 730
## Yes 33.78571 4764.786 8.185714 1.635714 3.164286 2.907143 15.32857 140
# Head-count and mean measures for every Attrition x BusinessTravel
# combination. TotalPop is the column the travel bar charts plot; Total holds
# the same count and is used for ordering.
TravelStats <- Attrition %>%
  group_by(Attrition, BusinessTravel) %>%
  summarize(TotalPop = n(),
            MeanWorkAge = mean(Age),
            MeanWorkLifeBalance = mean(WorkLifeBalance),
            MeanJobSatisfaction = mean(JobSatisfaction),
            # Computed once; the original listed this column twice.
            MeanTotalWorkingYears = mean(TotalWorkingYears),
            MeanMonthlyIncome = mean(MonthlyIncome),
            MeanYearsAtCompany = mean(YearsAtCompany),
            MeanJobLevel = mean(JobLevel),
            MeanPerformance = mean(PerformanceRating),
            # Fixed: this column previously averaged YearsInCurrentRole,
            # which does not match its name. Any rendered table pasted
            # after this chunk predates the fix; re-knit to refresh it.
            MeanYearsSincePromotion = mean(YearsSinceLastPromotion),
            MeanSalaryHike = mean(PercentSalaryHike),
            Total = n()) %>%
  # Drop the leftover Attrition grouping so later verbs see a plain table.
  ungroup() %>%
  arrange(Attrition, Total)
kable(TravelStats, position = "left")
## Attrition BusinessTravel TotalPop MeanWorkAge MeanWorkLifeBalance MeanJobSatisfaction MeanTotalWorkingYears MeanMonthlyIncome MeanYearsAtCompany MeanJobLevel MeanPerformance MeanYearsSincePromotion MeanSalaryHike Total
## No Non-Travel 83 37.54217 2.843374 2.795181 10.530121 5820.578 7.024096 1.963855 3.132530 4.156626 15.28916 83
## No Travel_Frequently 123 37.92683 2.804878 2.959350 12.406504 6750.512 8.113821 2.186992 3.162602 4.829268 15.04878 123
## No Travel_Rarely 524 37.27099 2.805344 2.709924 11.583970 6830.227 7.154580 2.124046 3.148855 4.412214 15.18702 524
## Yes Non-Travel 11 31.81818 2.545454 1.909091 7.181818 5385.727 3.818182 1.909091 3.272727 2.272727 16.54545 11
## Yes Travel_Frequently 35 32.48571 2.857143 2.714286 6.942857 3623.000 4.800000 1.400000 3.200000 2.514286 15.28571 35
## Yes Travel_Rarely 94 34.50000 2.563830 2.393617 8.765957 5117.255 5.500000 1.691489 3.138298 3.127660 15.20213 94
# Head-count and mean measures for every Attrition x JobRole combination.
# TotalPop, MeanJobSatisfaction and MeanWorkLifeBalance feed the job-role
# charts; Total holds the same count as TotalPop and is used for ordering.
JobRoleStats <- Attrition %>%
  group_by(Attrition, JobRole) %>%
  summarize(TotalPop = n(),
            MeanWorkAge = mean(Age),
            MeanWorkLifeBalance = mean(WorkLifeBalance),
            MeanJobSatisfaction = mean(JobSatisfaction),
            # Computed once; the original listed this column twice.
            MeanTotalWorkingYears = mean(TotalWorkingYears),
            MeanMonthlyIncome = mean(MonthlyIncome),
            MeanYearsAtCompany = mean(YearsAtCompany),
            MeanJobLevel = mean(JobLevel),
            MeanPerformance = mean(PerformanceRating),
            # Fixed: this column previously averaged YearsInCurrentRole,
            # which does not match its name. Any rendered table pasted
            # after this chunk predates the fix; re-knit to refresh it.
            MeanYearsSincePromotion = mean(YearsSinceLastPromotion),
            MeanSalaryHike = mean(PercentSalaryHike),
            Total = n()) %>%
  # Drop the leftover Attrition grouping so later verbs see a plain table.
  ungroup() %>%
  arrange(Attrition, Total)
kable(JobRoleStats, position = "left")
## Attrition JobRole TotalPop MeanWorkAge MeanWorkLifeBalance MeanJobSatisfaction MeanTotalWorkingYears MeanMonthlyIncome MeanYearsAtCompany MeanJobLevel MeanPerformance MeanYearsSincePromotion MeanSalaryHike Total
## No Human Resources 21 35.71429 2.952381 2.714286 7.095238 3527.905 5.333333 1.285714 3.190476 3.2380952 15.57143 21
## No Sales Representative 29 32.24138 2.758621 2.827586 5.275862 2849.207 3.379310 1.103448 3.103448 2.5862069 15.51724 29
## No Manager 47 47.36170 2.765957 2.531915 24.808511 17163.362 13.446808 4.319149 3.212766 6.2765957 15.06383 47
## No Research Director 50 43.74000 2.860000 2.480000 20.940000 15674.000 10.000000 3.860000 3.080000 5.6200000 14.96000 50
## No Healthcare Representative 68 39.17647 2.676471 2.838235 13.397059 7323.176 8.338235 2.426471 3.147059 4.7058824 15.42647 68
## No Manufacturing Director 85 38.05882 2.858824 2.741176 12.235294 7494.471 7.952941 2.458823 3.211765 5.4941176 15.75294 85
## No Laboratory Technician 123 34.82114 2.861789 2.739837 8.268293 3310.496 5.821138 1.292683 3.154472 3.5934959 14.87805 123
## No Research Scientist 140 34.97143 2.700000 2.900000 8.035714 3337.043 5.300000 1.214286 3.114286 3.4642857 15.25000 140
## No Sales Executive 167 36.73653 2.880240 2.784431 11.149701 6802.311 7.706587 2.317365 3.149701 4.8982036 14.92216 167
## Yes Research Director 1 41.00000 3.000000 3.000000 23.000000 19545.000 22.000000 5.000000 3.000000 15.0000000 12.00000 1
## Yes Manufacturing Director 2 46.50000 2.500000 2.000000 19.500000 7962.000 5.500000 2.500000 3.000000 4.5000000 13.50000 2
## Yes Manager 4 49.75000 2.750000 2.250000 23.250000 17594.250 17.750000 4.500000 3.000000 8.2500000 14.00000 4
## Yes Human Resources 6 28.50000 3.000000 2.000000 2.666667 2433.167 2.000000 1.000000 3.000000 0.8333333 13.50000 6
## Yes Healthcare Representative 8 39.87500 2.625000 2.750000 16.875000 8388.750 12.125000 2.750000 3.125000 5.6250000 14.25000 8
## Yes Sales Representative 24 28.37500 3.000000 2.541667 3.416667 2415.542 2.375000 1.000000 3.083333 1.5416667 14.54167 24
## Yes Laboratory Technician 30 31.30000 2.366667 2.466667 6.266667 2858.433 3.300000 1.166667 3.200000 2.1666667 15.63333 30
## Yes Research Scientist 32 33.84375 2.656250 2.375000 6.468750 2919.344 4.281250 1.062500 3.312500 2.0312500 17.06250 32
## Yes Sales Executive 33 36.48485 2.515151 2.424242 11.000000 7344.545 6.696970 2.424242 3.121212 4.0303030 14.90909 33
# Head-count and mean measures for every Attrition x OverTime combination.
# TotalPop is the column the overtime bar chart plots; Total holds the same
# count and is used for ordering.
OvertimeStats <- Attrition %>%
  group_by(Attrition, OverTime) %>%
  summarize(TotalPop = n(),
            MeanWorkAge = mean(Age),
            # Computed once; the original listed this column twice.
            MeanTotalWorkingYears = mean(TotalWorkingYears),
            MeanMonthlyIncome = mean(MonthlyIncome),
            MeanYearsAtCompany = mean(YearsAtCompany),
            MeanJobLevel = mean(JobLevel),
            MeanPerformance = mean(PerformanceRating),
            # Fixed: this column previously averaged YearsInCurrentRole,
            # which does not match its name. Any rendered table pasted
            # after this chunk predates the fix; re-knit to refresh it.
            MeanYearsSincePromotion = mean(YearsSinceLastPromotion),
            MeanSalaryHike = mean(PercentSalaryHike),
            Total = n()) %>%
  # Drop the leftover Attrition grouping so later verbs see a plain table.
  ungroup() %>%
  arrange(Attrition, Total)
kable(OvertimeStats, position = "left")
## Attrition OverTime TotalPop MeanWorkAge MeanTotalWorkingYears MeanMonthlyIncome MeanYearsAtCompany MeanJobLevel MeanPerformance MeanYearsSincePromotion MeanSalaryHike Total
## No Yes 172 38.27907 12.087209 7000.343 7.860465 2.203488 3.116279 4.697674 14.94767 172
## No No 558 37.14516 11.453405 6610.038 7.129032 2.089606 3.159498 4.378136 15.24552 558
## Yes No 60 33.78333 8.866667 5110.067 6.300000 1.750000 3.133333 3.350000 14.81667 60
## Yes Yes 80 33.78750 7.675000 4505.825 4.362500 1.550000 3.187500 2.575000 15.71250 80
# Box plots of three numeric measures, one box per Attrition value.

# Total years in the workforce
ggplot(Attrition, aes(x = Attrition, y = TotalWorkingYears)) +
  geom_boxplot(fill = "black", color = "blue") +
  labs(title = "Attrition based on Years in Workforce")

# Monthly rate
ggplot(Attrition, aes(x = Attrition, y = MonthlyRate)) +
  geom_boxplot(fill = "black", color = "blue") +
  labs(title = "Attrition based on Monthly Rate")

# Years in the current role
ggplot(Attrition, aes(x = Attrition, y = YearsInCurrentRole)) +
  geom_boxplot(fill = "black", color = "blue") +
  labs(title = "Attrition based on Years in Current Role")

# Stacked bars of head-count (TotalPop) taken from the pre-aggregated tables.

# Attrition on the x axis, stacked by travel category
ggplot(TravelStats, aes(x = Attrition, y = TotalPop, fill = BusinessTravel)) +
  geom_col() +
  labs(title = "Attrition based on Travel")

# Travel category on the (flipped) x axis, stacked by Attrition
ggplot(TravelStats,
       aes(x = reorder(BusinessTravel, TotalPop), y = TotalPop, fill = Attrition)) +
  geom_col() +
  coord_flip() +
  labs(title = "Business Travel Turnover", x = "Travel", y = "Total Population")

# Attrition on the x axis, stacked by overtime status
ggplot(OvertimeStats, aes(x = Attrition, y = TotalPop, fill = OverTime)) +
  geom_col() +
  labs(title = "Overtime vs No Overtime")

# Age against monthly income, one panel per Attrition value; each panel gets
# its own axis ranges (scales = "free").
ggplot(Attrition, aes(x = Age, y = MonthlyIncome)) +
  geom_point(aes(color = Attrition)) +
  labs(title = 'Age vs Monthly Income') +
  scale_color_discrete(name = "Attrition") +
  facet_wrap(~Attrition, scales = "free")

# Monthly income distribution per job role, shown as horizontal box plots;
# roles are ordered with reorder(JobRole, MonthlyIncome).
ggplot(Attrition,
       aes(x = reorder(JobRole, MonthlyIncome), y = MonthlyIncome, fill = JobRole)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = 'Job Role vs Monthly Income', x = "Job Role", y = "Monthly Income")

# Split the job-role summary by outcome, largest group first.
JobRoleStatsYes <- JobRoleStats %>%
  filter(Attrition == "Yes") %>%
  arrange(desc(TotalPop))
JobRoleStatsNo <- JobRoleStats %>%
  filter(Attrition == "No") %>%
  arrange(desc(TotalPop))

# Box plots built from the per-group means in JobRoleStats.
# NOTE(review): each role contributes only its Yes and No rows here, so every
# box summarises at most two values - confirm a box plot is the intended geom.

# Mean job satisfaction per role
ggplot(JobRoleStats,
       aes(x = reorder(JobRole, MeanJobSatisfaction), y = MeanJobSatisfaction, fill = JobRole)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = 'Job Role vs Satisfaction', x = "Job Role", y = "Job Satisfaction")

# Mean work-life balance per role
ggplot(JobRoleStats,
       aes(x = reorder(JobRole, MeanWorkLifeBalance), y = MeanWorkLifeBalance, fill = JobRole)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = 'Job Role vs Work Life Balance', x = "Job Role", y = "Work Life Balance")

# Horizontal bar charts per job role, drawn separately for employees who left
# (JobRoleStatsYes) and employees who stayed (JobRoleStatsNo).

# Leavers: head-count per role
JobRoleStatsYes %>%
  ggplot(aes(x = reorder(JobRole, TotalPop), y = TotalPop, fill = JobRole)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Attrition by Job Role") + xlab("Job Role") + ylab("Total Population")

# Leavers: mean work-life balance per role.
# Fixed: the value axis was labelled "Total Population" although it shows
# MeanWorkLifeBalance.
JobRoleStatsYes %>%
  ggplot(aes(x = reorder(JobRole, -MeanWorkLifeBalance), y = MeanWorkLifeBalance, fill = JobRole)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Work Life Balance by Job Role") + xlab("Job Role") + ylab("Mean Work Life Balance")

# Stayers: head-count per role.
# Fixed: the title said "Attrition by Job Role" although this is the
# Attrition == "No" subset.
JobRoleStatsNo %>%
  ggplot(aes(x = reorder(JobRole, TotalPop), y = TotalPop, fill = JobRole)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Retention by Job Role") + xlab("Job Role") + ylab("Total Population")

# Stayers: mean work-life balance per role (same axis-label fix as above).
JobRoleStatsNo %>%
  ggplot(aes(x = reorder(JobRole, -MeanWorkLifeBalance), y = MeanWorkLifeBalance, fill = JobRole)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Work Life Balance by Job Role") + xlab("Job Role") + ylab("Mean Work Life Balance")

# Interactive 3D scatter: total working years (x), age (y) and monthly income
# (z), with one colour per job role.
p <- plot_ly(Attrition, x = ~TotalWorkingYears, y = ~Age, z = ~MonthlyIncome, color = ~JobRole) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'WorkingYears'),
                      yaxis = list(title = 'Age'),
                      # Fixed axis-title typo ('MonthyIncome').
                      zaxis = list(title = 'MonthlyIncome')))
p

#Perform Classification using KNN

#Model Data using KNN for Age and Monthly Income

  ##Plot Relationship Between Age and Monthly Income
# Scatter of Age vs MonthlyIncome coloured by Attrition, with a linear fit per
# colour group. The argument is spelled out as `labels`; the original `label=`
# only worked through partial argument matching.
Attrition %>%
  ggplot(aes(x = Age, y = MonthlyIncome, color = Attrition)) +
  geom_point() +
  xlab("Age") +
  geom_smooth(method = "lm") +
  ylab("Monthly Income") +
  ggtitle("Relationship Between Age and Monthly Income") +
  scale_y_continuous(labels = comma)

#Split out training/test data - 70/30
set.seed(100)
splitPerc <- .70
trainAttrition <- sample(seq_len(nrow(Attrition)), round(splitPerc * nrow(Attrition)))
train <- Attrition[trainAttrition, ]
test <- Attrition[-trainAttrition, ]

# Predictors are selected by name instead of by position (the original used
# columns 2 and 20, which are Age and MonthlyIncome in this data set), so the
# model does not silently change if the column order does.
knnFeatures <- c("Age", "MonthlyIncome")
maxK <- 90
accs <- data.frame(accuracy = numeric(maxK), k = numeric(maxK))

#Formulate the optimal k value for KNN
# For each k, classify the test rows and record the overall accuracy.
# The stray table() call whose result was discarded has been removed.
for (i in seq_len(maxK)) {
  classifications <- knn(train[, knnFeatures], test[, knnFeatures],
                         train$Attrition, prob = TRUE, k = i)
  CM <- confusionMatrix(table(test$Attrition, classifications))
  accs$accuracy[i] <- CM$overall["Accuracy"]
  accs$k[i] <- i
}

# Accuracy as a function of k, with the best k marked in red.
plot(accs$k, accs$accuracy, type = "l", xlab = "k")
abline(v = accs$k[which.max(accs$accuracy)], col = "red")

accs$k[which.max(accs$accuracy)]
## [1] 8
# Draw another 70/30 partition; no seed is set immediately before this draw.
# NOTE(review): train/test from this partition are not used by the knn() call
# below - confirm whether that was intended.
splitPerc <- .70
nObs <- nrow(Attrition)
trainAttrition <- sample(seq_len(nObs), round(splitPerc * nObs))
train <- Attrition[trainAttrition, ]
test <- Attrition[-trainAttrition, ]

# Columns 2 and 20 are the two KNN predictors (Age and MonthlyIncome per the
# str() listing later in this file).
# NOTE(review): the classifier is both fit and scored on the full data set,
# and uses k = 5 although the search above reported k = 8 - confirm.
knnCols <- c(2, 20)
classification <- knn(train = Attrition[, knnCols],
                      test = Attrition[, knnCols],
                      cl = Attrition$Attrition,
                      k = 5,
                      prob = TRUE)

# Cross-tabulate predictions (rows) against the recorded outcome (columns).
table(classification, Attrition$Attrition)
##               
## classification  No Yes
##            No  717 118
##            Yes  13  22
confusionMatrix(table(classification, Attrition$Attrition))
## Confusion Matrix and Statistics
## 
##               
## classification  No Yes
##            No  717 118
##            Yes  13  22
##                                           
##                Accuracy : 0.8494          
##                  95% CI : (0.8239, 0.8725)
##     No Information Rate : 0.8391          
##     P-Value [Acc > NIR] : 0.2176          
##                                           
##                   Kappa : 0.1999          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9822          
##             Specificity : 0.1571          
##          Pos Pred Value : 0.8587          
##          Neg Pred Value : 0.6286          
##              Prevalence : 0.8391          
##          Detection Rate : 0.8241          
##    Detection Prevalence : 0.9598          
##       Balanced Accuracy : 0.5697          
##                                           
##        'Positive' Class : No              
## 

#Perform Classification using Random Forest

#Model Data with Random Forest
  ##Read in Data
# Both CSV files are read again here, replacing the objects loaded at the top.
Attrition <- read.csv(
  "/Users/Kevin/Desktop/School/Doing Data Science/Project 2/CaseStudy2_data.csv",
  header = TRUE
)
NOAttrition <- read.csv(
  "/Users/Kevin/Desktop/School/Doing Data Science/Project 2/CaseStudy2CompSet No Attrition.csv",
  header = TRUE
)

#Ensure Attrition is changed to factor
# "Yes" is listed first, so it becomes the first factor level.
Attrition$Attrition <- factor(as.character(Attrition$Attrition),
                              levels = c("Yes", "No"))

# Forest fitted only to rank predictors: every column except MonthlyIncome,
# 1000 trees, importance measures kept, the forest itself discarded.
# NOTE(review): no seed is set before this fit.
Attrition_Variables <- randomForest(Attrition ~ . - MonthlyIncome,
                                    data = Attrition,
                                    ntree = 1000,
                                    importance = TRUE,
                                    keep.forest = FALSE)
varImpPlot(Attrition_Variables)

#Split test and train data - 70/30
set.seed(3033)
splitPerc <- .70
nObs <- nrow(Attrition)
trainAttrition <- sample(seq_len(nObs), round(splitPerc * nObs))
train <- Attrition[trainAttrition, ]
test <- Attrition[-trainAttrition, ]

#Fit the random forest on the training partition
# All columns except Age are predictors. Sampling is stratified on the
# outcome, drawing 60 rows from each class per tree.
EmpAtt <- randomForest(Attrition ~ . - Age,
                       data = train,
                       strata = train$Attrition,
                       sampsize = c(60, 60))

  #Predict the held-out test partition with the fitted forest
AttPredict <- predict(EmpAtt, newdata = test)

  #Create confusion matrix to assess accuracy stats
confusionMatrix(data = AttPredict, reference = test$Attrition)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Yes  No
##        Yes  25  30
##        No   11 195
##                                          
##                Accuracy : 0.8429         
##                  95% CI : (0.793, 0.8849)
##     No Information Rate : 0.8621         
##     P-Value [Acc > NIR] : 0.838542       
##                                          
##                   Kappa : 0.4593         
##                                          
##  Mcnemar's Test P-Value : 0.004937       
##                                          
##             Sensitivity : 0.69444        
##             Specificity : 0.86667        
##          Pos Pred Value : 0.45455        
##          Neg Pred Value : 0.94660        
##              Prevalence : 0.13793        
##          Detection Rate : 0.09579        
##    Detection Prevalence : 0.21073        
##       Balanced Accuracy : 0.78056        
##                                          
##        'Positive' Class : Yes            
## 
#Apply Random Forest to the output file
# Refit on the full labelled data with the same settings as the evaluated
# model (all columns except Age; 60 rows per class per tree), then score the
# competition set.
EmpAtt2 <- randomForest(Attrition ~ . - Age,
                        data = Attrition,
                        strata = Attrition$Attrition,
                        sampsize = c(60, 60))

AttPredict2 <- predict(EmpAtt2, newdata = NOAttrition)

# Pair each competition-set ID with its predicted class; the column names are
# the ones data.frame() derives from the unnamed arguments.
EmpAttPreds <- data.frame(NOAttrition.ID = NOAttrition$ID,
                          AttPredict2 = AttPredict2)
#EmpAttPreds

# The export stays disabled; the object to write is EmpAttPreds (the original
# commented line named a non-existent EmpAttritionPreds).
#write.csv(EmpAttPreds, "/Users/Kevin/Desktop/School/Doing Data Science/Project 2/Case2PredictionsAlbrightAttrition.csv")

##Linear Regression Model vs Random Forest for Predicting Salary

#Linear Regression Model
# Competition set to be scored by the salary models.
NOSalary <- read.csv("/Users/Kevin/Desktop/School/Doing Data Science/Project 2/CaseStudy2CompSet No Salary.csv", header = TRUE)

# Age vs MonthlyIncome with a fitted straight line (red) on a white panel.
# The argument is spelled out as `labels`; the original `label=` only worked
# through partial argument matching.
ggplot(data = Attrition, aes(x = Age, y = MonthlyIncome)) +
  geom_point() +
  stat_smooth(method = "lm", col = "red") +
  theme(panel.background = element_rect(fill = "white"),
        axis.line.x = element_line(),
        axis.line.y = element_line()) +
  ggtitle("Linear Model Fitted to Data") +
  scale_y_continuous(labels = comma)

# Reproducible 70/30 partition for the salary models.
set.seed(100)
splitPerc <- .70
nObs <- nrow(Attrition)
trainAttrition <- sample(seq_len(nObs), round(splitPerc * nObs))
trainSalary <- Attrition[trainAttrition, ]
testSalary <- Attrition[-trainAttrition, ]

# Simple linear regression of monthly income on age, fitted to the training
# partition only.
fit_1 <- lm(MonthlyIncome ~ Age, data = trainSalary)

summary(fit_1)
## 
## Call:
## lm(formula = MonthlyIncome ~ Age, data = trainSalary)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -9430  -2720   -705   2035  12653 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2328.48     728.79  -3.195  0.00147 ** 
## Age           237.71      19.14  12.419  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4175 on 607 degrees of freedom
## Multiple R-squared:  0.2026, Adjusted R-squared:  0.2013 
## F-statistic: 154.2 on 1 and 607 DF,  p-value: < 2.2e-16
# Age vs MonthlyIncome with a fitted straight line (red) on a white panel.
# The argument is spelled out as `labels`; the original `label=` only worked
# through partial argument matching.
ggplot(data = Attrition, aes(x = Age, y = MonthlyIncome)) +
  geom_point() +
  stat_smooth(method = "lm", col = "red") +
  theme(panel.background = element_rect(fill = "white"),
        axis.line.x = element_line(),
        axis.line.y = element_line()) +
  ggtitle("Linear Model Fitted to Data") +
  scale_y_continuous(labels = comma)

# Predictions of the Age-only linear model for the competition set.
# NOTE(review): presumably NOSalary has no MonthlyIncome column (file name
# "No Salary"), so these predictions cannot be scored directly.
SalaryPreds <- predict(fit_1, NOSalary)

# Hold-out error of the linear model: predict the test partition and compare
# with the incomes of those same rows.
# Fixed: the original compared the NOSalary predictions with
# Attrition$MonthlyIncome, i.e. different employees and vectors of different
# length, so the RMSE it printed (5008.04) was not a measure of model error.
TestSalaryPreds <- predict(fit_1, newdata = testSalary)
RMSE(TestSalaryPreds, testSalary$MonthlyIncome)
#data.frame(NOSalary$ID,SalaryPreds)


#Random Forest Model
  #Train Data

# New 70/30 partition for the random-forest salary model; it replaces the
# trainSalary/testSalary objects created for the linear model.
splitPerc <- .70
nObs <- nrow(Attrition)
trainAttrition <- sample(seq_len(nObs), round(splitPerc * nObs))
trainSalary <- Attrition[trainAttrition, ]
testSalary <- Attrition[-trainAttrition, ]
# Show the structure (column types) of the training partition.
str(trainSalary)
## 'data.frame':    609 obs. of  36 variables:
##  $ ID                      : int  395 530 124 478 596 744 619 258 867 302 ...
##  $ Age                     : int  42 56 38 26 33 23 39 30 32 35 ...
##  $ Attrition               : Factor w/ 2 levels "Yes","No": 2 1 2 2 2 2 2 2 2 2 ...
##  $ BusinessTravel          : chr  "Travel_Frequently" "Travel_Rarely" "Travel_Rarely" "Travel_Rarely" ...
##  $ DailyRate               : int  1271 441 243 775 586 160 613 855 976 853 ...
##  $ Department              : chr  "Research & Development" "Research & Development" "Sales" "Sales" ...
##  $ DistanceFromHome        : int  2 14 7 29 1 4 6 7 26 18 ...
##  $ Education               : int  1 4 4 2 3 1 1 4 4 5 ...
##  $ EducationField          : chr  "Medical" "Life Sciences" "Marketing" "Medical" ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  875 161 709 618 855 1735 2062 1428 333 74 ...
##  $ EnvironmentSatisfaction : int  2 2 4 1 1 3 4 4 3 2 ...
##  $ Gender                  : chr  "Male" "Female" "Female" "Male" ...
##  $ HourlyRate              : int  35 72 46 45 48 51 42 73 100 71 ...
##  $ JobInvolvement          : int  3 3 2 3 4 3 2 3 3 3 ...
##  $ JobLevel                : int  1 1 2 2 2 1 3 2 2 3 ...
##  $ JobRole                 : chr  "Research Scientist" "Research Scientist" "Sales Executive" "Sales Executive" ...
##  $ JobSatisfaction         : int  4 2 4 3 1 2 1 1 4 1 ...
##  $ MaritalStatus           : chr  "Single" "Married" "Single" "Divorced" ...
##  $ MonthlyIncome           : int  2515 4963 4028 4306 4037 3295 9991 4779 4465 9069 ...
##  $ MonthlyRate             : int  9068 4510 7791 4267 21816 12862 21457 12761 12069 11031 ...
##  $ NumCompaniesWorked      : int  5 9 0 5 1 1 4 7 0 1 ...
##  $ Over18                  : chr  "Y" "Y" "Y" "Y" ...
##  $ OverTime                : chr  "Yes" "Yes" "No" "No" ...
##  $ PercentSalaryHike       : int  14 18 20 12 22 13 15 14 18 22 ...
##  $ PerformanceRating       : int  3 3 4 3 4 3 3 3 3 4 ...
##  $ RelationshipSatisfaction: int  4 1 1 1 1 3 1 2 1 4 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  0 3 0 2 1 0 1 2 0 1 ...
##  $ TotalWorkingYears       : int  8 7 8 8 9 3 9 8 4 9 ...
##  $ TrainingTimesLastYear   : int  2 2 2 5 5 3 5 3 2 3 ...
##  $ WorkLifeBalance         : int  3 3 3 3 3 1 3 3 3 2 ...
##  $ YearsAtCompany          : int  2 5 7 0 9 3 7 3 3 9 ...
##  $ YearsInCurrentRole      : int  1 4 7 0 8 2 7 2 2 8 ...
##  $ YearsSinceLastPromotion : int  2 4 0 0 0 1 1 0 2 1 ...
##  $ YearsWithCurrManager    : int  2 3 5 0 8 2 7 2 2 8 ...
# Regression forest for monthly income using every column except Age,
# fitted to the training partition.
EmpRandoSalary <- randomForest(MonthlyIncome ~ . - Age, data = trainSalary)

# Predict the held-out partition.
PredictSalary <- predict(EmpRandoSalary, newdata = testSalary)
  #Test Data
RMSE(testSalary$MonthlyIncome, PredictSalary)
## [1] 1263.272
# Actual vs predicted income for the test rows, with a fitted straight line.
PredDF <- data.frame(MonthlyIncome = testSalary$MonthlyIncome,
                     PredictSalary = PredictSalary)
ggplot(PredDF, aes(x = MonthlyIncome, y = PredictSalary)) +
  geom_point() +
  labs(title = "Relationship Between Predicted and Actuals",
       x = "Actual", y = "Predicted") +
  geom_smooth(method = "lm")

#data.frame(testSalary$ID, PredictSalary)

#Random Forest model used for the final salary predictions
# Recode Attrition in the competition set with the same level order
# ("Yes", "No") that was applied to the labelled data.
NOSalary$Attrition <- factor(as.character(NOSalary$Attrition), levels = c("Yes", "No"))
# NOTE(review): this forest is fitted on the 70% training partition only -
# consider refitting on all of Attrition for the final submission.
SalaryRF <- randomForest(MonthlyIncome ~ .-Age, data = trainSalary)
#importance(SalaryRF)
#varImpPlot(SalaryRF)

#Test on provided salary data set
#str(NOSalary)
#str(Attrition)
# Fixed: `importance = TRUE, ntree = 500` were passed to predict(), where
# they have no effect; both are options of the fit, not of prediction, so
# they are dropped here.
PredictSalary2 <- predict(SalaryRF, newdata = NOSalary)
summary(PredictSalary2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2089    3222    5278    6232    6284   18023
# Pair each competition-set ID with its predicted income and write the file.
PredictionsAlbright <- data.frame(NOSalary$ID, PredictSalary2)
write.csv(PredictionsAlbright,"/Users/Kevin/Desktop/School/Doing Data Science/Project 2/AlbrightSalaryPredictions.csv")